library(tidyverse)
library(lubridate)
Data downloaded from https://docs.google.com/spreadsheets/d/1lgR-UpR9v9lmtoeK1ZhlVfs0FBj1xXz_eImhcrSI4Q0/
# Load the 2014 Bogota car-accident export, letting readr guess column types
accidents_raw <- "data/Car_Accidents_2014_Bogota.csv" %>%
  read_csv()
Parsed with column specification:
cols(
the_geom = col_character(),
cartodb_id = col_integer(),
codigo_accidente = col_integer(),
fecha_ocurrencia = col_date(format = ""),
hora_ocurrencia = col_character(),
gravedad = col_character(),
objectid = col_integer(),
localidad = col_character(),
municipio = col_character(),
x = col_double(),
y = col_double(),
timedate = col_datetime(format = ""),
clase = col_character()
)
# Print the raw tibble to eyeball the parsed columns and their types
accidents_raw
the_geom cartodb_id codigo_accidente fecha_ocurrencia hora_ocurrencia gravedad objectid localidad municipio x y timedate <S3: POSIXct> 2014-10-29 14:10:00 clase
# The data is for one year (2014). To filter by month (etc.), we first need a `month` variable. Could use **lubridate**, or `separate()`. The former avoids months 10/11/12 sorting before 2/3/4/etc. (as they would if stored as text), but the latter should also be OK in plots as long as the pieces are converted to integers.
# Open the help page for separate(), which is used to split the date below
help("separate")

# Break the accident date into year / month / day columns
# (`convert = TRUE` gets y & m & d as integers), then keep only 2014.
accidents <- accidents_raw %>%
  separate(
    col = fecha_ocurrencia,
    into = c("year", "month", "day"),
    sep = "-",
    convert = TRUE
  ) %>%
  filter(year == 2014) # 17 rows from 2013 filtered out
# Split the accident time into hour / minute / second columns.
#
# Nine rows hold the placeholder "1899-12-30" instead of an HH:MM:SS time.
# They are dropped *before* splitting (keeping only values with exactly three
# colon-separated pieces), so that:
#   * separate() no longer warns
#       "Too few values at 9 locations: 323, 773, 3594, 4071, 5378, 6861,
#        7874, 8184, 9044"
#   * `convert = TRUE` can turn `hour` into an integer, instead of leaving it
#     as character because of the placeholder strings.
accidents <- accidents %>%
  filter(grepl("^[^:]+:[^:]+:[^:]+$", hora_ocurrencia)) %>% # 9 rows filtered out
  separate(
    hora_ocurrencia,
    c("hour", "minute", "second"),
    ":",
    convert = TRUE
  )
accidents # (9973 rows x 17 cols)

# Tabulate each column in turn to see how many distinct values it has and
# how the rows are spread across them
count(accidents, the_geom) # 6921 rows (so not unique)
count(count(accidents, the_geom), n) # 21 rows (1 to 22, without 18)
count(accidents, cartodb_id) # 9973 rows (confirmed as unique)
count(accidents, codigo_accidente) # 9973 rows (unique). Seem to be 5e+05 or 4e+06
count(accidents, month) # 12 rows. `n` is 485-1584. Higher in Oct/Nov/Dec
count(accidents, hour) # 24 rows. `n` is 51-722 (2am, 2pm)
count(accidents, gravedad) # 3 rows. "CON HERIDOS", "CON MUERTOS", "SOLO DANOS"
count(accidents, objectid) # 9973 rows (confirmed as unique)
count(accidents, localidad) # 19 rows. `n` is 52-1176
count(accidents, municipio) # 1 row ("BOGOTA"). `n` is 9973
count(accidents, clase) # 7 rows. `n` is 1 ("INCENDIO") to 8494 ("CHOQUE")
Initial data visualisation, to see if there might be a location/time pattern
# Add the calendar fields used by the time-pattern plots:
#   date     - rebuilt from the year / month / day columns
#   week_no  - lubridate::week() of that date
#   week_day - labelled day of the week
#   hour     - coerced to numeric so it orders 0-23 on a plot axis
#   n        - total row count, repeated on every row
#              NOTE(review): not used by the plots in view; confirm it is needed
accidents <- mutate(
  accidents,
  date = make_date(year, month, day),
  week_no = week(date),
  week_day = wday(date, label = TRUE),
  hour = as.numeric(hour),
  n = n()
)
accidents
# Accidents over the year, one histogram panel per localidad
accidents %>%
  ggplot(mapping = aes(x = date)) +
  geom_histogram() +
  facet_wrap(~ localidad)

# Accident coordinates, coloured by localidad
accidents %>%
  ggplot(mapping = aes(x = x, y = y, colour = localidad)) +
  geom_point()
Maybe look at a time-based model (especially if there is data for other years too, in which case start with localidad=ENGATIVA). And/or a visualisation (like ACotgreave? if I can get week#). Or time of day (model/viz). Or connected to streetlights (if there’s a joining field)
# Number of accidents in each (week of year, day of week) cell
accidents2 <- accidents %>%
  group_by(week_no, week_day) %>%
  summarise(n = n()) %>%
  ungroup() # summarise() leaves the result grouped by week_no; drop that
accidents2

# Bubble plots of the weekly pattern, in both orientations.
# `alpha` is a fixed setting, so it belongs outside aes(): inside aes() it is
# treated as a mapped variable, which adds a meaningless "0.5" legend and
# does not actually set the transparency to 0.5.
ggplot(accidents2, aes(week_day, week_no)) +
  geom_point(aes(colour = n, size = n), alpha = 0.5)
ggplot(accidents2, aes(week_no, week_day)) +
  geom_point(aes(colour = n, size = n), alpha = 0.5)
# Number of accidents in each (day of week, hour of day) cell
accidents3 <- accidents %>%
  group_by(week_day, hour) %>%
  summarise(n = n()) %>%
  ungroup() # summarise() leaves the result grouped by week_day; drop that
accidents3

# Bubble plot of hour of day against day of week.
# `alpha` is set outside aes() because it is a constant, not a data mapping;
# inside aes() it would create a spurious legend instead of fixing the
# transparency at 0.5.
ggplot(accidents3, aes(week_day, hour)) +
  geom_point(aes(colour = n, size = n), alpha = 0.5)
# Create a factor, to order the hours correctly (0-23) on the plot axis
# library(forcats)
# hour_levels <- c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23")
# y1 <- factor(hour, levels = hour_levels)
# Couldn't fix an error here. Instead converted `hour` in `accidents` from <chr> to numeric
# y1
# ggplot(accidents3, aes(week_day, y1)) +
# geom_point(aes(colour = n, size = n, alpha = 0.5))
#--
# ggplot(accidents3, aes(hour, n)) +
# geom_density(aes(colour = week_day)) +
# facet_wrap(~week_day)
# Error in eval(substitute(list(...)), `_data`, parent.frame()) : object 'y' not found
# Number of accidents in each (day of week, hour, localidad) cell.
# Original note: "error: grouping gives rings on viz not total" - presumably
# the overlapping per-localidad points seen if this table is plotted without
# faceting by localidad (TODO confirm).
accidents4 <- accidents %>%
  group_by(week_day, hour, localidad) %>%
  summarise(n = n()) %>%
  ungroup() # drop the grouping that summarise() leaves behind
accidents4

# Hour-by-weekday bubble plot, one panel per localidad.
# `alpha` is a constant setting, so it sits outside aes().
ggplot(accidents4, aes(week_day, hour)) +
  geom_point(aes(colour = n, size = n), alpha = 0.5) +
  facet_wrap(~ localidad)

# Same plot with one panel per month. `accidents4` has no `month` column, so
# faceting it by month failed with:
#   Error in combine_vars(data, params$plot_env, vars, drop = params$drop) :
#   At least one layer must contain all variables used for facetting
# The counts are therefore recomputed here with `month` as a grouping variable.
accidents %>%
  group_by(week_day, hour, month) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  ggplot(aes(week_day, hour)) +
  geom_point(aes(colour = n, size = n), alpha = 0.5) +
  facet_wrap(~ month)
Try to create model of variation by hour. Start with localidad=ENGATIVA
# Total number of accidents for each hour of the day
accidents5 <- summarise(group_by(accidents, hour), n = n())
accidents5

# Overall hourly profile
ggplot(data = accidents5, mapping = aes(x = hour, y = n)) +
  geom_line()

# Hourly profile per day of week: overlaid lines, then one panel per day
# (as lines and as points)
ggplot(data = accidents3, mapping = aes(x = hour, y = n, colour = week_day)) +
  geom_line()
ggplot(data = accidents3, mapping = aes(x = hour, y = n, colour = week_day)) +
  geom_line() +
  facet_wrap(~week_day)
ggplot(data = accidents3, mapping = aes(x = hour, y = n, colour = week_day)) +
  geom_point() +
  facet_wrap(~week_day)
# ggplot(accidents4, aes(hour, n)) +
# geom_line(aes(colour = week_day)) +
# facet_grid(. ~ week_day)
# "Error in combine_vars(data, params$plot_env, cols, drop = params$drop) : At least one layer must contain all variables used for facetting"